5.2.4 Exercises

1. Find all flights that 1) Had an arrival delay of two or more hours, 2) Flew to Houston (IAH or HOU), 3) Were operated by United, American, or Delta, 4) Departed in summer (July, August, and September), 5) Arrived more than two hours late, but didn’t leave late, 6) Were delayed by at least an hour, but made up over 30 minutes in flight, 7) Departed between midnight and 6am (inclusive)

1)

# Keep flights whose arrival delay is at least 120 minutes (two hours).
flights %>%
  filter(arr_delay >= 120)
## # A tibble: 10,200 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      811            630      101.     1047
##  2  2013     1     1      848           1835      853.     1001
##  3  2013     1     1      957            733      144.     1056
##  4  2013     1     1     1114            900      134.     1447
##  5  2013     1     1     1505           1310      115.     1638
##  6  2013     1     1     1525           1340      105.     1831
##  7  2013     1     1     1549           1445       64.     1912
##  8  2013     1     1     1558           1359      119.     1718
##  9  2013     1     1     1732           1630       62.     2028
## 10  2013     1     1     1803           1620      103.     2008
## # ... with 10,190 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

2)

# Houston is served by two airports, so match dest against the set of both
# codes; %in% extends cleanly to more codes and never yields NA.
filter(flights, dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515        2.      830
##  2  2013     1     1      533            529        4.      850
##  3  2013     1     1      623            627       -4.      933
##  4  2013     1     1      728            732       -4.     1041
##  5  2013     1     1      739            739        0.     1104
##  6  2013     1     1      908            908        0.     1228
##  7  2013     1     1     1028           1026        2.     1350
##  8  2013     1     1     1044           1045       -1.     1352
##  9  2013     1     1     1114            900      134.     1447
## 10  2013     1     1     1205           1200        5.     1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

3)

# Carrier codes for United (UA), American (AA), and Delta (DL); test set
# membership with %in% rather than chaining == comparisons with |.
filter(flights, carrier %in% c("UA", "AA", "DL"))
## # A tibble: 139,504 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515        2.      830
##  2  2013     1     1      533            529        4.      850
##  3  2013     1     1      542            540        2.      923
##  4  2013     1     1      554            600       -6.      812
##  5  2013     1     1      554            558       -4.      740
##  6  2013     1     1      558            600       -2.      753
##  7  2013     1     1      558            600       -2.      924
##  8  2013     1     1      558            600       -2.      923
##  9  2013     1     1      559            600       -1.      941
## 10  2013     1     1      559            600       -1.      854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

4)

# Departures in July, August, or September (months 7 through 9).
flights %>%
  filter(month >= 7 & month <= 9)
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029      212.      236
##  2  2013     7     1        2           2359        3.      344
##  3  2013     7     1       29           2245      104.      151
##  4  2013     7     1       43           2130      193.      322
##  5  2013     7     1       44           2150      174.      300
##  6  2013     7     1       46           2051      235.      304
##  7  2013     7     1       48           2001      287.      308
##  8  2013     7     1       58           2155      183.      335
##  9  2013     7     1      100           2146      194.      327
## 10  2013     7     1      100           2245      135.      337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

5)

# "More than two hours late" is a strict inequality, so a flight that
# arrived exactly 120 minutes late must not be included.
# dep_delay <= 0 keeps flights that left on time or early.
filter(flights, arr_delay > 120, dep_delay <= 0)
## # A tibble: 29 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1    27     1419           1420       -1.     1754
##  2  2013    10     7     1350           1350        0.     1736
##  3  2013    10     7     1357           1359       -2.     1858
##  4  2013    10    16      657            700       -3.     1258
##  5  2013    11     1      658            700       -2.     1329
##  6  2013     3    18     1844           1847       -3.       39
##  7  2013     4    17     1635           1640       -5.     2049
##  8  2013     4    18      558            600       -2.     1149
##  9  2013     4    18      655            700       -5.     1213
## 10  2013     5    22     1827           1830       -3.     2217
## # ... with 19 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

6)

# Delayed at least 60 minutes at departure; dep_delay - arr_delay is the
# amount of delay made up between departure and arrival.
# NOTE(review): the prompt says "over 30 minutes", but `>= 30` also keeps
# flights that made up exactly 30 minutes. A strict reading would use `> 30`;
# confirm the intended boundary, since the printed output would change.
filter(flights, dep_delay >= 60, dep_delay - arr_delay >= 30)
## # A tibble: 2,074 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1     1716           1545       91.     2140
##  2  2013     1     1     2205           1720      285.       46
##  3  2013     1     1     2326           2130      116.      131
##  4  2013     1     3     1503           1221      162.     1803
##  5  2013     1     3     1821           1530      171.     2131
##  6  2013     1     3     1839           1700       99.     2056
##  7  2013     1     3     1850           1745       65.     2148
##  8  2013     1     3     1923           1815       68.     2036
##  9  2013     1     3     1941           1759      102.     2246
## 10  2013     1     3     1950           1845       65.     2228
## # ... with 2,064 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

7)

# dep_time is stored as HHMM and midnight is recorded as 2400 rather than 0,
# so midnight departures must be matched explicitly in addition to the
# times up to 600 (6am, inclusive).
filter(flights, dep_time <= 600 | dep_time == 2400)
## # A tibble: 9,373 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515        2.      830
##  2  2013     1     1      533            529        4.      850
##  3  2013     1     1      542            540        2.      923
##  4  2013     1     1      544            545       -1.     1004
##  5  2013     1     1      554            600       -6.      812
##  6  2013     1     1      554            558       -4.      740
##  7  2013     1     1      555            600       -5.      913
##  8  2013     1     1      557            600       -3.      709
##  9  2013     1     1      557            600       -3.      838
## 10  2013     1     1      558            600       -2.      753
## # ... with 9,363 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

2. Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?

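between() checks whether each value in a numeric vector falls within a specified range, with both endpoints included; between(x, left, right) is a shortcut for x >= left & x <= right.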

Using between() in previous challenges:

# Summer departures again, with between() replacing the two comparisons.
flights %>%
  filter(between(month, 7, 9))
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029      212.      236
##  2  2013     7     1        2           2359        3.      344
##  3  2013     7     1       29           2245      104.      151
##  4  2013     7     1       43           2130      193.      322
##  5  2013     7     1       44           2150      174.      300
##  6  2013     7     1       46           2051      235.      304
##  7  2013     7     1       48           2001      287.      308
##  8  2013     7     1       58           2155      183.      335
##  9  2013     7     1      100           2146      194.      327
## 10  2013     7     1      100           2245      135.      337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# between() covers the times up to 600 (6am); midnight is recorded as 2400
# in dep_time, so it still has to be matched separately.
filter(flights, between(dep_time, 0, 600) | dep_time == 2400)
## # A tibble: 9,373 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515        2.      830
##  2  2013     1     1      533            529        4.      850
##  3  2013     1     1      542            540        2.      923
##  4  2013     1     1      544            545       -1.     1004
##  5  2013     1     1      554            600       -6.      812
##  6  2013     1     1      554            558       -4.      740
##  7  2013     1     1      555            600       -5.      913
##  8  2013     1     1      557            600       -3.      709
##  9  2013     1     1      557            600       -3.      838
## 10  2013     1     1      558            600       -2.      753
## # ... with 9,363 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

3. How many flights have a missing dep_time? What other variables are missing? What might these rows represent?

# Rows with no recorded departure time.
flights %>%
  filter(is.na(dep_time))
## # A tibble: 8,255 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1       NA           1630        NA       NA
##  2  2013     1     1       NA           1935        NA       NA
##  3  2013     1     1       NA           1500        NA       NA
##  4  2013     1     1       NA            600        NA       NA
##  5  2013     1     2       NA           1540        NA       NA
##  6  2013     1     2       NA           1620        NA       NA
##  7  2013     1     2       NA           1355        NA       NA
##  8  2013     1     2       NA           1420        NA       NA
##  9  2013     1     2       NA           1321        NA       NA
## 10  2013     1     2       NA           1545        NA       NA
## # ... with 8,245 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

8,255 flights have a missing dep_time. These flights are also missing dep_delay, arr_time, arr_delay, and air_time, suggesting that they never departed and thus most likely represent canceled flights.

4. Why is NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA * 0 is a tricky counterexample!)

# Any number raised to the power 0 is 1, so the unknown value is irrelevant.
NA ^ 0
## [1] 1
# A TRUE on either side makes an OR true regardless of the other operand.
NA | TRUE
## [1] TRUE
# A FALSE on either side makes an AND false regardless of the other operand.
FALSE & NA
## [1] FALSE
# Not determined: 0 times an infinite value is NaN, so the result stays NA.
NA * 0 
## [1] NA

NA ^ 0 is not missing because any value to the 0th power equals 1. NA | TRUE is not missing because the second condition is TRUE, thus the result is TRUE (or logic - doesn’t matter what the other condition is). FALSE & NA is not missing because the first condition is FALSE, thus the result is FALSE (and logic - FALSE and anything will always be FALSE). However, NA * 0 is NA.

The general rule is that the result is missing whenever the unknown value could change the outcome of the operation; if every possible value would give the same answer, R returns that answer. NA * 0 shows why: for any finite value the result would be 0, but if the missing value were Inf or -Inf the result would be NaN, so R returns NA.

5.3.1 Exercises

1. How could you use arrange() to sort all missing values to the start? (Hint: use is.na()).

# is.na() is TRUE for missing values; sorting it in descending order puts
# the TRUE (missing) rows first.
flights %>%
  arrange(desc(is.na(dep_time)))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1       NA           1630        NA       NA
##  2  2013     1     1       NA           1935        NA       NA
##  3  2013     1     1       NA           1500        NA       NA
##  4  2013     1     1       NA            600        NA       NA
##  5  2013     1     2       NA           1540        NA       NA
##  6  2013     1     2       NA           1620        NA       NA
##  7  2013     1     2       NA           1355        NA       NA
##  8  2013     1     2       NA           1420        NA       NA
##  9  2013     1     2       NA           1321        NA       NA
## 10  2013     1     2       NA           1545        NA       NA
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

2. Sort flights to find the most delayed flights. Find the flights that left earliest.

Most delayed flights:

# Largest departure delays first.
flights %>%
  arrange(desc(dep_delay))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     9      641            900     1301.     1242
##  2  2013     6    15     1432           1935     1137.     1607
##  3  2013     1    10     1121           1635     1126.     1239
##  4  2013     9    20     1139           1845     1014.     1457
##  5  2013     7    22      845           1600     1005.     1044
##  6  2013     4    10     1100           1900      960.     1342
##  7  2013     3    17     2321            810      911.      135
##  8  2013     6    27      959           1900      899.     1236
##  9  2013     7    22     2257            759      898.      121
## 10  2013    12     5      756           1700      896.     1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Flights that left earliest:

# Most negative departure delays (left furthest ahead of schedule) first.
flights %>%
  arrange(dep_delay)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    12     7     2040           2123      -43.       40
##  2  2013     2     3     2022           2055      -33.     2240
##  3  2013    11    10     1408           1440      -32.     1549
##  4  2013     1    11     1900           1930      -30.     2233
##  5  2013     1    29     1703           1730      -27.     1947
##  6  2013     8     9      729            755      -26.     1002
##  7  2013    10    23     1907           1932      -25.     2143
##  8  2013     3    30     2030           2055      -25.     2213
##  9  2013     3     2     1431           1455      -24.     1601
## 10  2013     5     5      934            958      -24.     1225
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

3. Sort flights to find the fastest flights.

# Rank by distance covered per minute in the air, highest first.
flights %>%
  arrange(desc(distance / air_time))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     5    25     1709           1700        9.     1923
##  2  2013     7     2     1558           1513       45.     1745
##  3  2013     5    13     2040           2025       15.     2225
##  4  2013     3    23     1914           1910        4.     2045
##  5  2013     1    12     1559           1600       -1.     1849
##  6  2013    11    17      650            655       -5.     1059
##  7  2013     2    21     2355           2358       -3.      412
##  8  2013    11    17      759            800       -1.     1212
##  9  2013    11    16     2003           1925       38.       17
## 10  2013    11    16     2349           2359      -10.      402
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

4. Which flights travelled the longest? Which travelled the shortest?

Traveled longest:

# Longest distances first.
flights %>%
  arrange(desc(distance))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      857            900       -3.     1516
##  2  2013     1     2      909            900        9.     1525
##  3  2013     1     3      914            900       14.     1504
##  4  2013     1     4      900            900        0.     1516
##  5  2013     1     5      858            900       -2.     1519
##  6  2013     1     6     1019            900       79.     1558
##  7  2013     1     7     1042            900      102.     1620
##  8  2013     1     8      901            900        1.     1504
##  9  2013     1     9      641            900     1301.     1242
## 10  2013     1    10      859            900       -1.     1449
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Traveled shortest:

# Shortest distances first.
flights %>%
  arrange(distance)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7    27       NA            106       NA        NA
##  2  2013     1     3     2127           2129       -2.     2222
##  3  2013     1     4     1240           1200       40.     1333
##  4  2013     1     4     1829           1615      134.     1937
##  5  2013     1     4     2128           2129       -1.     2218
##  6  2013     1     5     1155           1200       -5.     1241
##  7  2013     1     6     2125           2129       -4.     2224
##  8  2013     1     7     2124           2129       -5.     2212
##  9  2013     1     8     2127           2130       -3.     2304
## 10  2013     1     9     2126           2129       -3.     2217
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

5.4.1 Exercises

1. Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.

1)

# Name each of the four columns explicitly.
flights %>%
  select(dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517        2.      830       11.
##  2      533        4.      850       20.
##  3      542        2.      923       33.
##  4      544       -1.     1004      -18.
##  5      554       -6.      812      -25.
##  6      554       -4.      740       12.
##  7      555       -5.      913       19.
##  8      557       -3.      709      -14.
##  9      557       -3.      838       -8.
## 10      558       -2.      753        8.
## # ... with 336,766 more rows

2)

# Pick the columns by their shared name prefixes.
flights %>%
  select(starts_with("dep"), starts_with("arr"))
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517        2.      830       11.
##  2      533        4.      850       20.
##  3      542        2.      923       33.
##  4      544       -1.     1004      -18.
##  5      554       -6.      812      -25.
##  6      554       -4.      740       12.
##  7      555       -5.      913       19.
##  8      557       -3.      709      -14.
##  9      557       -3.      838       -8.
## 10      558       -2.      753        8.
## # ... with 336,766 more rows

3)

# Pick the columns by position: 4 = dep_time, 6 = dep_delay,
# 7 = arr_time, 9 = arr_delay.
flights %>%
  select(4, 6, 7, 9)
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517        2.      830       11.
##  2      533        4.      850       20.
##  3      542        2.      923       33.
##  4      544       -1.     1004      -18.
##  5      554       -6.      812      -25.
##  6      554       -4.      740       12.
##  7      555       -5.      913       19.
##  8      557       -3.      709      -14.
##  9      557       -3.      838       -8.
## 10      558       -2.      753        8.
## # ... with 336,766 more rows

2. What happens if you include the name of a variable multiple times in a select() call?

# Ask for the same column twice to see how select() handles repeats.
flights %>%
  select(air_time, air_time)
## # A tibble: 336,776 x 1
##    air_time
##       <dbl>
##  1     227.
##  2     227.
##  3     160.
##  4     183.
##  5     116.
##  6     150.
##  7     158.
##  8      53.
##  9     140.
## 10     138.
## # ... with 336,766 more rows

Even if you include the name of a variable multiple times in a select() call, the variable is only included once in the data frame.

3. What does the one_of() function do? Why might it be helpful in conjunction with this vector?

# Column names to pull out of flights, stored as plain strings.
vars <- c(
  "year", "month", "day",
  "dep_delay", "arr_delay"
)

The one_of() function enables you to select variables matching elements in a character vector.

For example, you could select the variables in vars from flights:

# Select every column whose name appears in the character vector vars.
flights %>%
  select(one_of(vars))
## # A tibble: 336,776 x 5
##     year month   day dep_delay arr_delay
##    <int> <int> <int>     <dbl>     <dbl>
##  1  2013     1     1        2.       11.
##  2  2013     1     1        4.       20.
##  3  2013     1     1        2.       33.
##  4  2013     1     1       -1.      -18.
##  5  2013     1     1       -6.      -25.
##  6  2013     1     1       -4.       12.
##  7  2013     1     1       -5.       19.
##  8  2013     1     1       -3.      -14.
##  9  2013     1     1       -3.       -8.
## 10  2013     1     1       -2.        8.
## # ... with 336,766 more rows

4. Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default?

# Upper-case "TIME" still matches the lower-case column names shown below.
select(flights, contains("TIME"))
## # A tibble: 336,776 x 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time
##       <int>          <int>    <int>          <int>    <dbl>
##  1      517            515      830            819     227.
##  2      533            529      850            830     227.
##  3      542            540      923            850     160.
##  4      544            545     1004           1022     183.
##  5      554            600      812            837     116.
##  6      554            558      740            728     150.
##  7      555            600      913            854     158.
##  8      557            600      709            723      53.
##  9      557            600      838            846     140.
## 10      558            600      753            745     138.
## # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>

The select helpers ignore case by default. You can change that default by setting ignore.case = FALSE, as shown below.

# ignore.case takes a logical, not a string: pass FALSE (unquoted) to make
# the match case-sensitive. No column name contains upper-case "TIME".
select(flights, contains("TIME", ignore.case = FALSE))
## # A tibble: 336,776 x 0

5.5.2 Exercises

1. Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.

# Split each HHMM value into hours (%/% 100) and minutes (%% 100), combine
# them into minutes, and wrap with %% 1440 so that 2400 maps to 0.
flights %>%
  mutate(
    dep_time_min = (dep_time %/% 100 * 60 + dep_time %% 100) %% 1440,
    sched_dep_time_min =
      (sched_dep_time %/% 100 * 60 + sched_dep_time %% 100) %% 1440
  )
## # A tibble: 336,776 x 21
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515        2.      830
##  2  2013     1     1      533            529        4.      850
##  3  2013     1     1      542            540        2.      923
##  4  2013     1     1      544            545       -1.     1004
##  5  2013     1     1      554            600       -6.      812
##  6  2013     1     1      554            558       -4.      740
##  7  2013     1     1      555            600       -5.      913
##  8  2013     1     1      557            600       -3.      709
##  9  2013     1     1      557            600       -3.      838
## 10  2013     1     1      558            600       -2.      753
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, dep_time_min <dbl>,
## #   sched_dep_time_min <dbl>

(The %% 1440 at the end is needed because midnight is recorded as 2400, which would otherwise convert to 24 * 60 = 1440 minutes rather than 0.)

2. Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?

# Naive difference of the two HHMM clock readings, shown next to air_time.
f1 <- flights %>%
  mutate(diff = arr_time - dep_time)
f1 %>%
  select(air_time, diff)
## # A tibble: 336,776 x 2
##    air_time  diff
##       <dbl> <int>
##  1     227.   313
##  2     227.   317
##  3     160.   381
##  4     183.   460
##  5     116.   258
##  6     150.   186
##  7     158.   358
##  8      53.   152
##  9     140.   281
## 10     138.   195
## # ... with 336,766 more rows

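I would expect air_time and arr_time - dep_time to be the same, but that is not the case. This is because while air_time is in minutes, arr_time and dep_time are clock times in HHMM or HMM format, so subtracting them directly does not give a duration (for example, 830 - 517 = 313, although only 193 minutes separate 5:17 and 8:30). To fix the problem, arr_time and dep_time need to be converted into continuous numbers, such as minutes since midnight. Even then the two values may not match exactly: arr_time and dep_time are local times that can be in different time zones, some flights arrive after midnight, and air_time may not include time spent taxiing.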

4. Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().

# Rank flights from most to least delayed and keep ranks 1 through 10.
flights %>%
  filter(min_rank(desc(dep_delay)) <= 10)
## # A tibble: 10 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     9      641            900     1301.     1242
##  2  2013     1    10     1121           1635     1126.     1239
##  3  2013    12     5      756           1700      896.     1058
##  4  2013     3    17     2321            810      911.      135
##  5  2013     4    10     1100           1900      960.     1342
##  6  2013     6    15     1432           1935     1137.     1607
##  7  2013     6    27      959           1900      899.     1236
##  8  2013     7    22      845           1600     1005.     1044
##  9  2013     7    22     2257            759      898.      121
## 10  2013     9    20     1139           1845     1014.     1457
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

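min_rank() gives tied values the same rank (the smallest rank in the tied group) and leaves a gap afterwards, as in sports rankings. As a result, this filter can return more than 10 rows if several flights tie for 10th place.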

5. What does 1:3 + 1:10 return? Why?

# Adds a length-3 vector to a length-10 vector; see the warning and result below.
1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter
## object length
##  [1]  2  4  6  5  7  9  8 10 12 11

Because the two vectors have different lengths, R recycles the shorter one: 1:3 is repeated as 1, 2, 3, 1, 2, 3, 1, 2, 3, 1 to match the length of 1:10 before the element-wise addition. Since the length of the longer vector (10) is not a multiple of the length of the shorter vector (3), the last repetition is cut off after its first element, and R issues a warning.

6. What trigonometric functions does R provide?

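R provides cosine, sine, and tangent (cos(), sin(), tan()); arc-cosine, arc-sine, and arc-tangent (acos(), asin(), atan()); and the two-argument arc-tangent (atan2()). Angles are in radians. There are also cospi(), sinpi(), and tanpi(), which compute cos(pi * x), sin(pi * x), and tan(pi * x). See ?Trig.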

5.6.7 Exercises

1. Brainstorm at least 5 different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios: 1) A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time. 2) A flight is always 10 minutes late. 3) A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time. 4) 99% of the time a flight is on time. 1% of the time it’s 2 hours late. Which is more important: arrival delay or departure delay?

# For every flight number, compute the share of its non-missing arrival
# delays that equal each scenario value exactly (in minutes), then print it.
delay_chars <- flights %>%
  group_by(flight) %>%
  summarise(
    fifteen_min_early = mean(arr_delay == -15, na.rm = TRUE),
    fifteen_min_late = mean(arr_delay == 15, na.rm = TRUE),
    ten_min_late = mean(arr_delay == 10, na.rm = TRUE),
    thirty_min_early = mean(arr_delay == -30, na.rm = TRUE),
    thirty_min_late = mean(arr_delay == 30, na.rm = TRUE),
    on_time = mean(arr_delay == 0, na.rm = TRUE),
    two_hr_late = mean(arr_delay == 120, na.rm = TRUE)
  )
delay_chars
## # A tibble: 3,844 x 8
##    flight fifteen_min_early fifteen_min_late ten_min_late thirty_min_early
##     <int>             <dbl>            <dbl>        <dbl>            <dbl>
##  1      1           0.0215           0.0100       0.00574          0.00574
##  2      2           0.0392           0.0196       0.               0.     
##  3      3           0.00955          0.00637      0.0159           0.0159 
##  4      4           0.0358           0.0102       0.00767          0.0128 
##  5      5           0.0123           0.00617      0.00926          0.0216 
##  6      6           0.0291           0.00485      0.00485          0.0291 
##  7      7           0.0169           0.00424      0.               0.00847
##  8      8           0.0556           0.00855      0.0214           0.     
##  9      9           0.0132           0.0132       0.0197           0.     
## 10     10           0.0164           0.0164       0.0328           0.     
## # ... with 3,834 more rows, and 3 more variables: thirty_min_late <dbl>,
## #   on_time <dbl>, two_hr_late <dbl>

1)

# Compare the computed proportions with near() rather than ==, so that
# floating-point rounding cannot hide a 50/50 split.
filter(
  delay_chars,
  near(fifteen_min_early, 0.5),
  near(fifteen_min_late, 0.5)
)
## # A tibble: 0 x 8
## # ... with 8 variables: flight <int>, fifteen_min_early <dbl>,
## #   fifteen_min_late <dbl>, ten_min_late <dbl>, thirty_min_early <dbl>,
## #   thirty_min_late <dbl>, on_time <dbl>, two_hr_late <dbl>

2)

# Scenario 2: every recorded arrival is exactly 10 minutes late. ten_min_late
# is a computed double, so compare with near() rather than ==.
filter(delay_chars, near(ten_min_late, 1))
## # A tibble: 5 x 8
##   flight fifteen_min_early fifteen_min_late ten_min_late thirty_min_early
##    <int>             <dbl>            <dbl>        <dbl>            <dbl>
## 1   2254                0.               0.           1.               0.
## 2   3656                0.               0.           1.               0.
## 3   3785                0.               0.           1.               0.
## 4   3880                0.               0.           1.               0.
## 5   5854                0.               0.           1.               0.
## # ... with 3 more variables: thirty_min_late <dbl>, on_time <dbl>,
## #   two_hr_late <dbl>

3)

# Scenario 3: half of the arrivals exactly 30 minutes early, half exactly 30
# minutes late. The proportions are computed doubles, so they are compared
# with near() (tolerance-based) instead of ==.
filter(
  delay_chars,
  near(thirty_min_early, 0.5),
  near(thirty_min_late, 0.5)
)
## # A tibble: 0 x 8
## # ... with 8 variables: flight <int>, fifteen_min_early <dbl>,
## #   fifteen_min_late <dbl>, ten_min_late <dbl>, thirty_min_early <dbl>,
## #   thirty_min_late <dbl>, on_time <dbl>, two_hr_late <dbl>

4)

# Scenario 4: 99% of arrivals exactly on time and 1% exactly two hours late.
# 0.99 and 0.01 have no exact binary representation, so the computed
# proportions are compared with near() (tolerance-based) instead of ==.
filter(
  delay_chars,
  near(on_time, 0.99),
  near(two_hr_late, 0.01)
)
## # A tibble: 0 x 8
## # ... with 8 variables: flight <int>, fifteen_min_early <dbl>,
## #   fifteen_min_late <dbl>, ten_min_late <dbl>, thirty_min_early <dbl>,
## #   thirty_min_late <dbl>, on_time <dbl>, two_hr_late <dbl>

Arrival delay is more important, since it can directly affect connecting flights (with departure delay, there is a chance that time can be made up in the air).

2. Come up with another approach that will give you the same output as not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count()).

# Keep only the flights that have both a departure delay and an arrival delay.
not_cancelled <- filter(flights, !(is.na(dep_delay) | is.na(arr_delay)))

# Reference result: number of non-cancelled flights per destination.
count(not_cancelled, dest)
## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ... with 94 more rows
# Reference result: total distance per tail number (count weighted by distance).
count(not_cancelled, tailnum, wt = distance)
## # A tibble: 4,037 x 2
##    tailnum       n
##    <chr>     <dbl>
##  1 D942DN    3418.
##  2 N0EGMQ  239143.
##  3 N10156  109664.
##  4 N102UW   25722.
##  5 N103US   24619.
##  6 N104UW   24616.
##  7 N10575  139903.
##  8 N105UW   23618.
##  9 N107US   21677.
## 10 N108UW   32070.
## # ... with 4,027 more rows

Other approach:

# Same table as count(dest), built from group_by() + summarise() with n().
group_by(not_cancelled, dest) %>%
  summarise(n = n())
## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ... with 94 more rows
# Same table as count(tailnum, wt = distance), built without count(): sum the
# distance within each tail number. The summary column is named n so that the
# result matches the count() output exactly (count() names its column n).
not_cancelled %>%
  group_by(tailnum) %>%
  summarize(n = sum(distance))
## # A tibble: 4,037 x 2
##    tailnum       n
##    <chr>     <dbl>
##  1 D942DN    3418.
##  2 N0EGMQ  239143.
##  3 N10156  109664.
##  4 N102UW   25722.
##  5 N103US   24619.
##  6 N104UW   24616.
##  7 N10575  139903.
##  8 N105UW   23618.
##  9 N107US   21677.
## 10 N108UW   32070.
## # ... with 4,027 more rows

3. Our definition of cancelled flights (is.na(dep_delay) | is.na(arr_delay)) is slightly suboptimal. Why? Which is the most important column?

A flight cannot arrive without departing; thus dep_delay appears to be the most important column, and we can define cancelled flights by is.na(dep_delay).

5. Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights %>% group_by(carrier, dest) %>% summarise(n()))

# Mean departure delay per carrier, largest mean first.
group_by(flights, carrier) %>%
  summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_delay))
## # A tibble: 16 x 2
##    carrier avg_delay
##    <chr>       <dbl>
##  1 F9          20.2 
##  2 EV          20.0 
##  3 YV          19.0 
##  4 FL          18.7 
##  5 WN          17.7 
##  6 9E          16.7 
##  7 B6          13.0 
##  8 VX          12.9 
##  9 OO          12.6 
## 10 UA          12.1 
## 11 MQ          10.6 
## 12 DL           9.26
## 13 AA           8.59
## 14 AS           5.80
## 15 HA           4.90
## 16 US           3.78

Carrier F9 has the worst delays.

Challenge:

# Step 1: mean departure delay for every carrier/destination pair.
# Step 2: per carrier, the median absolute deviation of those means,
#         listed from the most to the least variable carrier.
group_by(flights, carrier, dest) %>%
  summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  group_by(carrier) %>%
  summarise(carrier_spread = mad(avg_delay, na.rm = TRUE)) %>%
  arrange(desc(carrier_spread))
## # A tibble: 16 x 2
##    carrier carrier_spread
##    <chr>            <dbl>
##  1 OO               11.0 
##  2 AA                7.61
##  3 9E                7.27
##  4 VX                7.24
##  5 UA                5.25
##  6 YV                5.23
##  7 WN                4.65
##  8 DL                4.19
##  9 EV                4.18
## 10 FL                3.52
## 11 B6                3.36
## 12 MQ                3.21
## 13 US                1.54
## 14 AS                0.  
## 15 F9                0.  
## 16 HA                0.

By grouping by carrier and calculating the median absolute deviation of the mean departure delay for each carrier/destination pair, we can see which carriers experience a large amount of variation in the mean departure delays across destinations (i.e. which carriers have a high carrier_spread). Since these carriers are not consistently bad across destinations, this would indicate that bad airports are likely to blame.

6. What does the sort argument to count() do. When might you use it?

If TRUE, the sort argument to count() will sort output in descending order of n. You could use it instead of count() then arrange().

5.7.1 Exercises

1. Refer back to the lists of useful mutate and filtering functions. Describe how each operation changes when you combine it with grouping.

When you combine functions like mean(), sum(), median(), etc… with grouping, the function will be applied to each group rather than to the whole dataset.

2. Which plane (tailnum) has the worst on-time record?

# Mean departure delay per plane (tail number), largest mean first.
group_by(flights, tailnum) %>%
  summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_delay))
## # A tibble: 4,044 x 2
##    tailnum avg_delay
##    <chr>       <dbl>
##  1 N844MH       297.
##  2 N922EV       274.
##  3 N587NW       272.
##  4 N911DA       268.
##  5 N851NW       233.
##  6 N654UA       227.
##  7 N928DN       203.
##  8 N7715E       186.
##  9 N665MQ       177.
## 10 N136DL       165.
## # ... with 4,034 more rows

Plane N844MH has the worst on-time record.

3. What time of day should you fly if you want to avoid delays as much as possible?

# Mean departure delay per value of hour, smallest mean first.
group_by(flights, hour) %>%
  summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(avg_delay)
## # A tibble: 20 x 2
##     hour avg_delay
##    <dbl>     <dbl>
##  1    5.     0.688
##  2    6.     1.64 
##  3    7.     1.91 
##  4    8.     4.13 
##  5    9.     4.58 
##  6   10.     6.50 
##  7   11.     7.19 
##  8   12.     8.61 
##  9   13.    11.4  
## 10   14.    13.8  
## 11   23.    14.0  
## 12   15.    16.9  
## 13   16.    18.8  
## 14   22.    18.8  
## 15   17.    21.1  
## 16   18.    21.1  
## 17   21.    24.2  
## 18   20.    24.3  
## 19   19.    24.8  
## 20    1.   NaN

You should fly at 5 am if you want to avoid delays as much as possible.

4. For each destination, compute the total minutes of delay. For each flight, compute the proportion of the total delay for its destination.

# Total arrival delay (in minutes, missing values ignored) per destination.
group_by(flights, dest) %>%
  summarise(total_delay_dest = sum(arr_delay, na.rm = TRUE))
## # A tibble: 105 x 2
##    dest  total_delay_dest
##    <chr>            <dbl>
##  1 ABQ              1113.
##  2 ACK              1281.
##  3 ALB              6018.
##  4 ANC               -20.
##  5 ATL            190260.
##  6 AUS             14514.
##  7 AVL              2089.
##  8 BDL              2904.
##  9 BGR              2874.
## 10 BHM              4540.
## # ... with 95 more rows
# Attach each destination's total arrival delay to its flights, then express
# every flight's own arrival delay as a fraction of that total.
group_by(flights, dest) %>%
  mutate(total_delay_dest = sum(arr_delay, na.rm = TRUE)) %>%
  mutate(prop_total_delay = arr_delay / total_delay_dest) %>%
  select(dest, arr_delay, total_delay_dest, prop_total_delay)
## # A tibble: 336,776 x 4
## # Groups:   dest [105]
##    dest  arr_delay total_delay_dest prop_total_delay
##    <chr>     <dbl>            <dbl>            <dbl>
##  1 IAH         11.           30046.        0.000366 
##  2 IAH         20.           30046.        0.000666 
##  3 MIA         33.            3467.        0.00952  
##  4 BQN        -18.            7322.       -0.00246  
##  5 ATL        -25.          190260.       -0.000131 
##  6 ORD         12.           97352.        0.000123 
##  7 FLL         19.           96153.        0.000198 
##  8 IAD        -14.           74631.       -0.000188 
##  9 MCO         -8.           76185.       -0.000105 
## 10 ORD          8.           97352.        0.0000822
## # ... with 336,766 more rows

6. Look at each destination. Can you find flights that are suspiciously fast? (i.e. flights that represent a potential data entry error). Compute the air time a flight relative to the shortest flight to that destination. Which flights were most delayed in the air?

# Flights grouped by destination and ordered by air_time, shortest first.
arrange(group_by(flights, dest), air_time)
## # A tibble: 336,776 x 19
## # Groups:   dest [105]
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1    16     1355           1315       40.     1442
##  2  2013     4    13      537            527       10.      622
##  3  2013    12     6      922            851       31.     1021
##  4  2013     2     3     2153           2129       24.     2247
##  5  2013     2     5     1303           1315      -12.     1342
##  6  2013     2    12     2123           2130       -7.     2211
##  7  2013     3     2     1450           1500      -10.     1547
##  8  2013     3     8     2026           1935       51.     2131
##  9  2013     3    18     1456           1329       87.     1533
## 10  2013     3    19     2226           2145       41.     2305
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Air time of each flight relative to the shortest recorded air time to the
# same destination; flights with the most extra time in the air come first.
flights %>%
  group_by(dest) %>%
  # na.rm is an argument of min() only. Passing it to mutate() as well would
  # add a spurious column named `na.rm` instead of affecting the calculation.
  mutate(rel_air_time = air_time - min(air_time, na.rm = TRUE)) %>%
  select(dest, air_time, rel_air_time, tailnum) %>%
  arrange(desc(rel_air_time))
## # A tibble: 336,776 x 4
## # Groups:   dest [105]
##    dest  air_time rel_air_time tailnum
##    <chr>    <dbl>        <dbl> <chr>  
##  1 SFO       490.         195. N703TW 
##  2 LAX       440.         165. N178DN 
##  3 EGE       382.         163. N5DBAA 
##  4 DEN       331.         149. N578UA 
##  5 LAX       422.         147. N192DN 
##  6 LAS       399.         143. N852UA 
##  7 SFO       438.         143. N727TW 
##  8 SAN       413.         134. N794JB 
##  9 HNL       695.         133. N77066 
## 10 SFO       426.         131. N319AA 
## # ... with 336,766 more rows

The flight flown by plane N703TW (to SFO) was the most delayed in the air.

7. Find all destinations that are flown by at least two carriers. Use that information to rank the carriers.

# Keep destinations served by two or more carriers, then rank carriers by how
# many of those destinations each one flies to.
group_by(flights, dest) %>%
  filter(n_distinct(carrier) >= 2) %>%
  group_by(carrier) %>%
  summarise(num_dest = n_distinct(dest)) %>%
  arrange(desc(num_dest))
## # A tibble: 16 x 2
##    carrier num_dest
##    <chr>      <int>
##  1 EV            51
##  2 9E            48
##  3 UA            42
##  4 DL            39
##  5 B6            35
##  6 AA            19
##  7 MQ            19
##  8 WN            10
##  9 OO             5
## 10 US             5
## 11 VX             4
## 12 YV             3
## 13 FL             2
## 14 AS             1
## 15 F9             1
## 16 HA             1

8. For each plane, count the number of flights before the first delay of greater than 1 hour.

# Per plane, keep the rows that come before its first departure delay of more
# than 60 minutes (running count of long delays still zero), then count them.
flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(tailnum) %>%
  filter(cumsum(dep_delay > 60) == 0) %>%
  count(sort = TRUE)
## # A tibble: 3,817 x 2
## # Groups:   tailnum [3,817]
##    tailnum     n
##    <chr>   <int>
##  1 N952UW    215
##  2 N315NB    161
##  3 N705TW    160
##  4 N706TW    149
##  5 N961UW    139
##  6 N713TW    128
##  7 N346NB    127
##  8 N765US    122
##  9 N721TW    120
## 10 N5FAAA    117
## # ... with 3,807 more rows

7.3.4 Exercises

1. Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.

# Histograms of the x, y and z variables of diamonds, each with 0.5-wide bins.
ggplot(diamonds, aes(x = x)) +
  geom_histogram(binwidth = 0.5)

ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5)

ggplot(diamonds, aes(x = z)) +
  geom_histogram(binwidth = 0.5)

The distribution of the x variable is between 3.5 and 9 for the most part, while the distribution of the y variable is generally between 5 and 10, and the distribution of the z variable is generally between 2.5 and 5 (although there are a few outliers in the y and z variables). The x and y variables are likely length and width since they have similar distribution domains, while the z variable is likely depth.

2. Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)

Default binwidth:

# Price histogram with the binwidth left unset.
ggplot(diamonds, aes(x = price)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Binwidth = 50:

# Price histogram with 50-wide bins.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 50)

Binwidth = 100:

# Price histogram with 100-wide bins.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 100)

Zoom in on binwidth = 100 plot:

# Price histogram with 100-wide bins, x axis limited to 0-2500 via xlim().
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 100) +
  xlim(0, 2500)
## Warning: Removed 26398 rows containing non-finite values (stat_bin).

Surprisingly, there are relatively few diamonds priced around $1500.

3. How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference?

# Number of diamonds whose carat is 0.99.
diamonds %>%
  filter(carat == 0.99) %>%
  count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1    23
# Number of diamonds whose carat is 1.
diamonds %>%
  filter(carat == 1) %>%
  count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1  1558
# Carat histogram with 0.01-wide bins, zoomed to the 0.98-1.01 range.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.01) +
  coord_cartesian(xlim = c(0.98, 1.01))

There are 23 diamonds of 0.99 carat and 1558 diamonds of 1 carat. This is likely because even though 0.99 and 1 are very close, a 1 carat diamond seems more valuable/desirable since it is a full carat rather than a fraction of a carat.

4. Compare and contrast coord_cartesian() vs xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?

# Baseline price histogram (no zoom, binwidth unset).
ggplot(diamonds, aes(x = price)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

coord_cartesian():

# Price histogram with the y axis zoomed to 0-4000 using coord_cartesian().
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 4000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ylim():

# Price histogram with the y axis limited to 0-4000 using ylim().
ggplot(diamonds, aes(x = price)) +
  geom_histogram() +
  ylim(0, 4000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (geom_bar).

coord_cartesian() draws the histogram first and then zooms in on the specified area, while xlim() and ylim() draw the histogram after dropping any values outside of the specified limits.

7.4.1 Exercises

1. What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference?

# Histogram of dep_delay, a variable that contains missing values.
flights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8255 rows containing non-finite values (stat_bin).

# Bar chart of carrier after turning every "UA" entry into a missing value.
ggplot(
  mutate(flights, carrier = ifelse(carrier == "UA", NA, carrier)),
  aes(x = carrier)
) +
  geom_bar()

Missing values in a histogram are removed before drawing the histogram/calculating counts for each bin. Missing values in a bar chart are placed in a separate NA category. There is a difference because histograms are used to visualize continuous variables, while bar charts are used to visualize categorical variables. In a histogram, the variable on the x axis needs to have a numeric value. Since NA values do not have a numeric value, they cannot be placed in any bin and need to be removed. In a bar chart, on the other hand, the value on the x axis does not need to be numeric - the NA values can simply be counted together as another category.

2. What does na.rm = TRUE do in mean() and sum()?

It removes the missing values before computing mean() and sum().

7.5.1.1 Exercises

1. Use what you’ve learned to improve the visualisation of the departure times of cancelled vs. non-cancelled flights.

# Density-scaled frequency polygons of scheduled departure time, one line for
# cancelled flights (no dep_time) and one for the rest. sched_dep_time is
# converted from HHMM form to minutes (hours * 60 + minutes, modulo 1440).
flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_dep_time_min =
      (60 * (sched_dep_time %/% 100) + sched_dep_time %% 100) %% 1440
  ) %>%
  ggplot(aes(x = sched_dep_time_min, y = ..density.., color = cancelled)) +
  geom_freqpoly(binwidth = 25)

2. What variable in the diamonds dataset is most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive?

# Price against carat, as points with a smoothed trend line on top.
diamonds %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Carat size is most important for predicting the price of a diamond.

# Boxplots of carat for each cut.
diamonds %>%
  ggplot(aes(x = cut, y = carat)) +
  geom_boxplot()

Fair and good cut diamonds generally have a larger carat size than very good and ideal diamonds, which could explain why lower quality diamonds are more expensive.

3. Install the ggstance package, and create a horizontal boxplot. How does this compare to using coord_flip()?

library(ggstance) 
## 
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh
# Horizontal boxplots (ggstance): carat on the x axis, one box per cut.
ggplot(diamonds, aes(x = carat, y = cut)) +
  geom_boxploth()

To get the same plot with coord_flip:

# Vertical boxplots of carat per cut, then flipped with coord_flip().
ggplot(diamonds, aes(x = cut, y = carat)) +
  geom_boxplot() +
  coord_flip()

4. One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of “outlying values”. One approach to remedy this problem is the letter value plot. Install the lvplot package, and try using geom_lv() to display the distribution of price vs cut. What do you learn? How do you interpret the plots?

With boxplot:

# Boxplots of price for each cut.
diamonds %>%
  ggplot(aes(x = cut, y = price)) +
  geom_boxplot()

With lvplot:

library(lvplot)
# Letter-value plot (lvplot) of price for each cut.
diamonds %>%
  ggplot(aes(x = cut, y = price)) +
  geom_lv()
## Error: GeomLv was built with an incompatible version of ggproto.
## Please reinstall the package that provides this extension.

There are fewer outliers in the letter value plot than in the boxplot. The letter value plot also shows more quantiles than the boxplot (i.e. beyond the quartiles), which is useful for large datasets.

5. Compare and contrast geom_violin() with a facetted geom_histogram(), or a coloured geom_freqpoly(). What are the pros and cons of each method?

geom_violin():

# Violin plots of price for each cut.
diamonds %>%
  ggplot(aes(x = cut, y = price)) +
  geom_violin()

Faceted geom_histogram():

# Price histograms, one facet column per cut.
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram() +
  facet_grid(. ~ cut)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Colored geom_freqpoly():

# Price frequency polygons, one coloured line per cut.
diamonds %>%
  ggplot(aes(x = price, color = cut)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

geom_violin() and faceted geom_histogram() are useful for visualizing individual distributions, while colored geom_freqpoly() is useful for comparing distributions.

6. If you have a small dataset, it’s sometimes useful to use geom_jitter() to see the relationship between a continuous and categorical variable. The ggbeeswarm package provides a number of methods similar to geom_jitter(). List them and briefly describe what each one does.

geom_quasirandom() and geom_beeswarm() both are used to reduce overplotting by plotting points that would ordinarily overlap next to each other. geom_quasirandom() spaces the points based on either a van der Corput sequence or Tukey texturing; geom_beeswarm() uses a point-size based offset.

7.5.2.1 Exercises

1. How could you rescale the count dataset above to more clearly show the distribution of cut within colour, or colour within cut?

Distribution of cut within color:

# Tile plot of the share of each cut within every color: counts per
# color/cut pair divided by the total count of that color.
diamonds %>%
  count(color, cut) %>%
  group_by(color) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = color, y = cut, fill = prop)) +
  geom_tile()

Distribution of color within cut:

# Tile plot of the share of each color within every cut: counts per
# color/cut pair divided by the total count of that cut.
diamonds %>%
  count(color, cut) %>%
  group_by(cut) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = color, y = cut, fill = prop)) +
  geom_tile()

2. Use geom_tile() together with dplyr to explore how average flight delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?

# Tile plot of the mean departure delay for every destination/month pair.
group_by(flights, dest, month) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = month, y = dest, fill = avg_dep_delay)) +
  geom_tile()

The plot is difficult to read since the destinations are unordered, but this could be improved by ordering them, e.g. by the average flight delay; the plot below orders the destinations by the number of months in which each one has flights. It is also hard to differentiate between the shades of blue, so a different color scale would also help to make the plot easier to read.

# Mean departure delay per month/destination pair; num_month is the number of
# months that appear for each destination and sets the order of the y axis.
# Month is shown as a discrete axis and the fill uses the Spectral palette.
group_by(flights, month, dest) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(dest) %>%
  mutate(num_month = n()) %>%
  ggplot(
    aes(
      x = factor(month),
      y = reorder(dest, num_month),
      fill = avg_dep_delay
    )
  ) +
  geom_tile() +
  scale_fill_distiller(type = "div", palette = "Spectral")

3. Why is it slightly better to use aes(x = color, y = cut) rather than aes(x = cut, y = color) in the example above?

Using aes(x = color, y = cut):

# Count tiles with color on the x axis and cut on the y axis.
count(diamonds, color, cut) %>%
  ggplot(aes(x = color, y = cut, fill = n)) +
  geom_tile()

Using aes(x = cut, y = color):

# Count tiles with cut on the x axis and color on the y axis.
count(diamonds, color, cut) %>%
  ggplot(aes(x = cut, y = color, fill = n)) +
  geom_tile()

It is better to use aes(x = color, y = cut) as this will produce approximately square tiles that are easier to compare to each other (compared to the rectangles that aes(x = cut, y = color) will produce).

7.5.3.1 Exercises

1. Instead of summarising the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs cut_number()? How does that impact a visualisation of the 2d distribution of carat and price?

# Price frequency polygons, one line per 0.25-wide carat interval.
ggplot(diamonds, aes(x = price, color = cut_width(carat, 0.25))) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Price frequency polygons, one line per carat group from cut_number(carat, 10).
ggplot(diamonds, aes(x = price, color = cut_number(carat, 10))) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Since cut_width() and cut_number() split a variable into groups, the number or width of bins chosen needs to be large enough to be able to visualize changes in distribution between groups but not too large.

2. Visualise the distribution of carat, partitioned by price.

library(ggstance) 
# Distribution of carat, partitioned by price: price is cut into 10 groups
# with cut_number() and one horizontal box of carat is drawn per price group.
# (The exercise asks for carat within price bins, so carat is the plotted
# variable and price is the binned one.)
ggplot(data = diamonds, mapping = aes(x = carat, y = cut_number(price, 10))) +
  geom_boxploth()

3. How does the price distribution of very large diamonds compare to small diamonds? Is it as you expect, or does it surprise you?

The price distribution for larger diamonds is more spread out. This is expected, as other factors such as color, cut, etc… may alter the pricing of large diamonds (these factors are less noticeable with smaller diamonds).

4. Combine two of the techniques you’ve learned to visualise the combined distribution of cut, carat, and price.

# Boxplots of price per cut, coloured by carat group from cut_number(carat, 5).
ggplot(diamonds, aes(x = cut, y = price, color = cut_number(carat, 5))) +
  geom_boxplot()

# Mean price for every cut / carat-group pair (carat split into 10 groups with
# cut_number()), drawn as tiles filled with the Spectral palette.
diamonds %>%
  mutate(carat_group = cut_number(carat, 10)) %>%
  group_by(cut, carat_group) %>%
  summarise(avg_price = mean(price)) %>%
  ggplot(aes(x = cut, y = carat_group, fill = avg_price)) +
  geom_tile() +
  scale_fill_distiller(type = "div", palette = "Spectral")

5. Two dimensional plots reveal outliers that are not visible in one dimensional plots. For example, some points in the plot below have an unusual combination of x and y values, which makes the points outliers even though their x and y values appear normal when examined separately. Why is a scatterplot a better display than a binned plot for this case?

Scatterplot:

# Scatterplot of y against x, zoomed to 4-11 on both axes.
ggplot(diamonds, aes(x = x, y = y)) +
  geom_point() +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))

Binned plot:

# 2d binned counts of y against x (bins = 100), zoomed to 4-11 on both axes.
ggplot(diamonds, aes(x = x, y = y)) +
  geom_bin2d(bins = 100) +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))

In the scatterplot, we can see the outliers as individual points, rather than as binned counts, making the scatterplot a better display than a binned plot for this case.